2021 ONE-Seq Focused Analysis Report
library(tidyverse)
library(here)
library(plotly)
library(scales)
library(tidyr)
library(kableExtra)
library(tibble)PROJECT INFORMATION & METADATA
For more detailed project information please see accompanying report; “Metadata_Report.Rmd”
Information contained:
- Project goals & summary
- Sequencing metadata
- File metadata
- ONE-Seq pipeline run metadata
For additional, less-structured analyses please see accompanying report: “Analysis_Report.Rmd”.
Sample Overview
cat(readLines(here("data/ONE-Seq_Run_Data/README.md")), sep = '\n')## The primary output files (scores.tsv and QC.csv) from ONEseq are copied here and renamed according to their naming schema.
##
## *XXX00m*: XXX = wet-lab prep location, sequencing location, bioinformatics location, dataset number, *m*anually prepared
##
## *N* = NIST *J* = Joung Lab
##
## | Name | Date | Description |
## |--------|---------|-------------------------------------------------------------------------------------------------------------|
## | JJJ01m | 2021-05 | UMI length of 8, SeQure data for target sites, run by Martin |
## | JJN01m | NA | UMI length of 8, SeQure data for target sites, run by Sierra |
## | JJN02m | NA | Replicate 1 of SeQure's HBB data, UMI = 8 |
## | JJN03m | NA | Replicate 2 of SeQure's HBB data, UMI = 8 |
## | NNN01m | 2021-05 | Natasha's prep of samples with UMI=11 but run with UMI set to 8 - 'test experiments' |
## | NNN02m | 2021-05 | Same wet-lab samples as NNN01m but changed UMI param from 8 to 11 'replicate experiments - R1' |
## | NNN03m | 2021-07 | Replicate 1 of Triplicate data for our 3 target sites generated July 2021 'replicate experiments - R1' |
## | NNN04m | 2021-07 | Replicate 2 of Triplicate data for our 3 target sites generated July 2021 'replicate experiments - R1' |
## | NNN05m | 2021-07 | Replicate 3 of Triplicate data for our 3 target sites generated July 2021 'replicate experiments - R1' |
## | NNN06m | 2021-07 | Replicate 1 of Triplicate data for our 3 target sites generated September 2021 'replicate experiments - R2' |
## | NNN07m | 2021-07 | Replicate 2 of Triplicate data for our 3 target sites generated September 2021 'replicate experiments - R2' |
## | NNN08m | 2021-07 | Replicate 3 of Triplicate data for our 3 target sites generated September 2021 'replicate experiments - R2' |
LOAD DATA
Load & mung data
TARGET SITES: EMX1, FANCF, RNF2
########################################################################
# MAIN OUTPUT
########################################################################
## get list of files in results directory
scores_file_list <- as.list(list.files(path = here("data/report_input"),
pattern = "scores.tsv",
include.dirs = TRUE, full.names = TRUE,
recursive = TRUE))
## strip the path portion of the file names
scores_file_names <- str_remove(scores_file_list, paste0(here("data/report_input"),"/"))
## organize the file list to read them into a df
scores_df_lst <- set_names(scores_file_list, scores_file_names)
## make lists into a data frame
scores_metadata_df <- tibble(scores_file = unlist(scores_file_names)) %>%
separate(scores_file, c("lab","target_site"),
sep = "_", remove = FALSE) %>%
mutate(target_site = str_remove(target_site, ".scores.tsv"))
## read in files & annotate with the metadata
scores_df <- scores_df_lst %>%
map_dfr(read_tsv, .id = "scores_file") %>%
## if normalized proto AND normalized pam = 0 remove because
## no signal to support that barcode/off-target
filter(normalized_counts_pam + normalized_counts_proto > 0) %>%
left_join(scores_metadata_df)
########################################################################
# SUMMARY OUTPUT
########################################################################
## get list of files in results directory
qc_file_list <- as.list(list.files(path = here("data/report_input"),
pattern = "QC.csv",
include.dirs = TRUE, full.names = TRUE,
recursive = TRUE))
## strip the path portion of the file names
qc_file_names <- str_remove(qc_file_list, paste0(here("data/report_input"),"/"))
## organize the file list to read them into a df
qc_df_lst <- set_names(qc_file_list, qc_file_names)
## make lists into a data frame
qc_metadata_df <- tibble(qc_file = unlist(qc_file_names)) %>%
separate(qc_file, c("lab","target_site"),
sep = "_", remove = FALSE) %>%
mutate(target_site = str_remove(target_site, ".QC.csv"))
## read in files & annotate with the metadata
qc_df <- qc_df_lst %>%
map_df(read_csv, col_names = c("var","number_of_seqs"), .id = "sample") %>%
mutate(var= str_remove(var, "number_of_")) %>%
mutate(proto_pam = str_extract(var, "pam")) %>% ## detect pam string and place pam in col
mutate(proto_pam = replace_na(proto_pam, "proto")) %>% ## all NAs in this column will be "proto"
mutate(var= str_remove(var, "_pam")) %>% ## remove strings to just the variables
mutate(var= str_remove(var, "_proto")) %>%
pivot_wider(names_from = proto_pam, values_from = number_of_seqs, values_fill = NA) %>%
separate(sample,c("lab","target_site"), sep = "_", remove = FALSE) %>%
select(-sample) %>%
mutate(target_site = str_remove(target_site, ".QC.csv"))
########################################################################
# VIKRAM'S QC OUTPUT
########################################################################
################## QC SCORES FILE
## get list of files in results directory
vikramqc_file_list <- as.list(list.files(path = here("data/Vikram_QC/report_input"),
pattern = "QC.csv",
include.dirs = TRUE, full.names = TRUE,
recursive = TRUE))
## strip the path portion of the file names
vikramqc_file_names <- str_remove(vikramqc_file_list, paste0(here("data/Vikram_QC/report_input"),"/"))
## organize the file list to read them into a df
vikramqc_df_lst <- set_names(vikramqc_file_list, vikramqc_file_names)
## make lists into a data frame
vikramqc_metadata_df <- tibble(vikramqc_file = unlist(vikramqc_file_names)) %>%
separate(vikramqc_file, c("lab","target_site"),
sep = "_", remove = FALSE) %>%
mutate(target_site = str_remove(target_site, "QC.csv"))
## read in files & annotate with the metadata
vikramqc_df <- vikramqc_df_lst %>%
map_dfr(read_csv, .id = "vikramqc_file") %>%
left_join(vikramqc_metadata_df)
################## QC SUMMARY FILE
## get list of files in results directory
vikramqc_summ_file_list <- as.list(list.files(path = here("data/Vikram_QC/report_input"),
pattern = "QC.tsv",
include.dirs = TRUE, full.names = TRUE,
recursive = TRUE))
## strip the path portion of the file names
vikramqc_summ_file_names <- str_remove(vikramqc_summ_file_list, paste0(here("data/Vikram_QC/report_input"),"/"))
## organize the file list to read them into a df
vikramqc_summ_df_lst <- set_names(vikramqc_summ_file_list, vikramqc_summ_file_names)
## make lists into a data frame
vikramqc_summ_metadata_df <- tibble(vikramqc_summ_file = unlist(vikramqc_summ_file_names)) %>%
separate(vikramqc_summ_file, c("lab","target_site"),
sep = "_", remove = FALSE) %>%
mutate(target_site = str_remove(target_site, ".QC.tsv"))
## read in files & annotate with the metadata
vikramqc_summ_df <- vikramqc_summ_df_lst %>%
map_df(read_tsv, .id = "sample") %>%
separate(sample,c("lab","target_site"), sep = "_", remove = TRUE)
########################################################################
# CALCULATING RAW READ COUNTS
########################################################################
# EQUATION FOR RAW READS
## on target reads = 1/smallest non-zero normalized read count
## per row raw read count = on target read * that row's normalized read count
scores_df <- scores_df %>%
filter(!lab == "NNN01m") %>%
group_by(lab, target_site) %>%
## note the smalled normalized counts for pam/proto to be used in calculation
mutate(min_pam_norm_count = min(normalized_counts_pam[normalized_counts_pam>0]),
min_proto_norm_count = min(normalized_counts_proto[normalized_counts_proto>0])) %>%
mutate(pam_on_target_reads = 1/min_pam_norm_count,
proto_on_target_reads = 1/min_proto_norm_count) %>%
rowwise() %>%
mutate(pam_raw_read_count = normalized_counts_pam * pam_on_target_reads,
proto_raw_read_count = normalized_counts_proto * proto_on_target_reads) %>%
select(-min_pam_norm_count, -min_proto_norm_count)
########################################################################
# COLOR PALETTE
########################################################################
## establish color palette for plots
cbPalette <- c("#000000",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#0072B2",
"#D55E00",
"#CC79A7",
"#999999",
"#20F03B",
"#FEC44F",
"#D95F0E",
"#756BB1",
"#FF8A33",
"#D7BE07",
"#A807D7")
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT
########################################################################
rm(qc_df_lst)
rm(qc_file_list)
rm(scores_df_lst)
rm(scores_file_list)
rm(vikramqc_df_lst)
rm(vikramqc_file_list)
rm(vikramqc_summ_df_lst)
rm(vikramqc_summ_file_list)R1 REPLICATE COMPARISONS
Create a dataframe with R1 data only to use in this section. (NNN03m, NNN04m, NNN05m)
########################################################################
# DEFINE LABS
########################################################################
## create R1 df: NNN02, NNN03, NNN04
labs <- c("NNN03m", "NNN04m", "NNN05m")
########################################################################
# CREATE R1 SCORES DF
########################################################################
r1_scores_df <- scores_df %>%
filter(lab %in% labs)
########################################################################
# CREATE R1 QC DF
########################################################################
## qc df
r1_qc_df <- qc_df %>%
filter(lab %in% labs)TOTAL NUMBER OF BARCODES
The total number of barcodes with at least 1 read for each triplicate & target site.
########################################################################
# CREATE MINIMAL DF THAT DESCRIBES TOTAL BARCODES
########################################################################
total_barcodes_plot_df <- r1_scores_df %>%
group_by(target_site, lab) %>%
count(target_site, lab, name = "total_barcodes") %>%
arrange(target_site, lab)
########################################################################
# PLOT
########################################################################
ggplotly(ggplot(total_barcodes_plot_df) +
geom_bar(aes(x = target_site, y = total_barcodes, fill = lab),
# dodge prserves vertical position, single groups bars of same metric together
# stat = identity skips aggregation of values - I provide values
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) + # add commas to axis label numbers
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Target Site", y = "Total Number of Barcodes",
fill = "Lab", main = "Total Number of Barcodes") +
scale_fill_manual(values=cbPalette))########################################################################
# TABLE
########################################################################
(total_barcodes_plot_df %>%
pivot_wider(names_from = "lab",
values_from = "total_barcodes",
values_fill = 0) %>%
rowwise() %>%
mutate(AVG = mean(c(NNN03m, NNN04m, NNN05m)),
SD = sd(c(NNN03m, NNN04m, NNN05m)),
CV = SD / AVG) %>%
arrange(desc(CV)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | NNN03m | NNN04m | NNN05m | AVG | SD | CV |
|---|---|---|---|---|---|---|
| FANCF | 17127 | 10916 | 9347 | 12463.33 | 4114.338 | 0.3301154 |
| RNF2 | 14479 | 11270 | 14453 | 13400.67 | 1845.257 | 0.1376989 |
| EMX1 | 23447 | 27021 | 25265 | 25244.33 | 1787.090 | 0.0707917 |
########################################################################
# REMOVE ENV VARS
########################################################################
rm(total_barcodes_plot_df)QC READ COUNTS
From the ONE-Seq QC_scores.csv file: The number of reads (proto & pam) in each QC category, plotted for each target site.
####################################################################
## FUNCTION TO MAKE PLOTS
####################################################################
make_qc_reads_plot <- function(df, targetsite){
plot_df <- df %>%
filter(target_site == {{targetsite}}) %>%
pivot_longer(names_to = "proto_pam",
values_to = "read_count",
cols = c("proto","pam")) %>%
group_by(target_site)
## plot
ggplotly(ggplot(plot_df) +
geom_bar(aes(x = var, y = read_count, fill = lab),
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
facet_wrap(~proto_pam) +
labs(x = "Metric", y = "Read Count",
fill = "Lab", title = {{targetsite}}) + scale_fill_manual(values=cbPalette))
}
####################################################################
## FUNCTION TO MAKE TABLE
####################################################################
make_qc_table <- function(df, targetsite){
labs <- df$labs
(plot_df <- df %>%
filter(target_site == {{targetsite}}) %>%
pivot_longer(names_to = "proto_pam", values_to = "read_count", cols = c("proto","pam")) %>%
group_by(target_site) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
}EMX1
make_qc_reads_plot(r1_qc_df, "EMX1")make_qc_table(r1_qc_df, "EMX1")| lab | target_site | var | proto_pam | read_count |
|---|---|---|---|---|
| NNN03m | EMX1 | seqs | proto | 489072 |
| NNN03m | EMX1 | seqs | pam | 437017 |
| NNN03m | EMX1 | seqs_with_primer | proto | 466871 |
| NNN03m | EMX1 | seqs_with_primer | pam | 418686 |
| NNN03m | EMX1 | seqs_with_umi | proto | 386126 |
| NNN03m | EMX1 | seqs_with_umi | pam | 392840 |
| NNN03m | EMX1 | seqs_with_libbarcode | proto | 343483 |
| NNN03m | EMX1 | seqs_with_libbarcode | pam | 286664 |
| NNN03m | EMX1 | seqs_with_target | proto | 420164 |
| NNN03m | EMX1 | seqs_with_target | pam | 314442 |
| NNN03m | EMX1 | seqs_passing_all_filters | proto | 324280 |
| NNN03m | EMX1 | seqs_passing_all_filters | pam | 271902 |
| NNN04m | EMX1 | seqs | proto | 770000 |
| NNN04m | EMX1 | seqs | pam | 453853 |
| NNN04m | EMX1 | seqs_with_primer | proto | 737277 |
| NNN04m | EMX1 | seqs_with_primer | pam | 434914 |
| NNN04m | EMX1 | seqs_with_umi | proto | 605297 |
| NNN04m | EMX1 | seqs_with_umi | pam | 407884 |
| NNN04m | EMX1 | seqs_with_libbarcode | proto | 534600 |
| NNN04m | EMX1 | seqs_with_libbarcode | pam | 300442 |
| NNN04m | EMX1 | seqs_with_target | proto | 655693 |
| NNN04m | EMX1 | seqs_with_target | pam | 329496 |
| NNN04m | EMX1 | seqs_passing_all_filters | proto | 506708 |
| NNN04m | EMX1 | seqs_passing_all_filters | pam | 284894 |
| NNN05m | EMX1 | seqs | proto | 676250 |
| NNN05m | EMX1 | seqs | pam | 490001 |
| NNN05m | EMX1 | seqs_with_primer | proto | 644761 |
| NNN05m | EMX1 | seqs_with_primer | pam | 469711 |
| NNN05m | EMX1 | seqs_with_umi | proto | 528627 |
| NNN05m | EMX1 | seqs_with_umi | pam | 439024 |
| NNN05m | EMX1 | seqs_with_libbarcode | proto | 464578 |
| NNN05m | EMX1 | seqs_with_libbarcode | pam | 312312 |
| NNN05m | EMX1 | seqs_with_target | proto | 572272 |
| NNN05m | EMX1 | seqs_with_target | pam | 343725 |
| NNN05m | EMX1 | seqs_passing_all_filters | proto | 438149 |
| NNN05m | EMX1 | seqs_passing_all_filters | pam | 296486 |
FANCF
make_qc_reads_plot(r1_qc_df, "FANCF")make_qc_table(r1_qc_df, "FANCF")| lab | target_site | var | proto_pam | read_count |
|---|---|---|---|---|
| NNN03m | FANCF | seqs | proto | 560255 |
| NNN03m | FANCF | seqs | pam | 781033 |
| NNN03m | FANCF | seqs_with_primer | proto | 536202 |
| NNN03m | FANCF | seqs_with_primer | pam | 747295 |
| NNN03m | FANCF | seqs_with_umi | proto | 449942 |
| NNN03m | FANCF | seqs_with_umi | pam | 702493 |
| NNN03m | FANCF | seqs_with_libbarcode | proto | 404072 |
| NNN03m | FANCF | seqs_with_libbarcode | pam | 531995 |
| NNN03m | FANCF | seqs_with_target | proto | 489640 |
| NNN03m | FANCF | seqs_with_target | pam | 579930 |
| NNN03m | FANCF | seqs_passing_all_filters | proto | 383699 |
| NNN03m | FANCF | seqs_passing_all_filters | pam | 503640 |
| NNN04m | FANCF | seqs | proto | 269079 |
| NNN04m | FANCF | seqs | pam | 419329 |
| NNN04m | FANCF | seqs_with_primer | proto | 255912 |
| NNN04m | FANCF | seqs_with_primer | pam | 401859 |
| NNN04m | FANCF | seqs_with_umi | proto | 216768 |
| NNN04m | FANCF | seqs_with_umi | pam | 378782 |
| NNN04m | FANCF | seqs_with_libbarcode | proto | 196217 |
| NNN04m | FANCF | seqs_with_libbarcode | pam | 292696 |
| NNN04m | FANCF | seqs_with_target | proto | 237303 |
| NNN04m | FANCF | seqs_with_target | pam | 318702 |
| NNN04m | FANCF | seqs_passing_all_filters | proto | 184659 |
| NNN04m | FANCF | seqs_passing_all_filters | pam | 277495 |
| NNN05m | FANCF | seqs | proto | 241539 |
| NNN05m | FANCF | seqs | pam | 372237 |
| NNN05m | FANCF | seqs_with_primer | proto | 228055 |
| NNN05m | FANCF | seqs_with_primer | pam | 355724 |
| NNN05m | FANCF | seqs_with_umi | proto | 191388 |
| NNN05m | FANCF | seqs_with_umi | pam | 335440 |
| NNN05m | FANCF | seqs_with_libbarcode | proto | 172498 |
| NNN05m | FANCF | seqs_with_libbarcode | pam | 254841 |
| NNN05m | FANCF | seqs_with_target | proto | 210319 |
| NNN05m | FANCF | seqs_with_target | pam | 277710 |
| NNN05m | FANCF | seqs_passing_all_filters | proto | 160805 |
| NNN05m | FANCF | seqs_passing_all_filters | pam | 240830 |
RNF2
make_qc_reads_plot(r1_qc_df, "RNF2")make_qc_table(r1_qc_df, "RNF2")| lab | target_site | var | proto_pam | read_count |
|---|---|---|---|---|
| NNN03m | RNF2 | seqs | proto | 504328 |
| NNN03m | RNF2 | seqs | pam | 297314 |
| NNN03m | RNF2 | seqs_with_primer | proto | 481040 |
| NNN03m | RNF2 | seqs_with_primer | pam | 284814 |
| NNN03m | RNF2 | seqs_with_umi | proto | 396864 |
| NNN03m | RNF2 | seqs_with_umi | pam | 266692 |
| NNN03m | RNF2 | seqs_with_libbarcode | proto | 345287 |
| NNN03m | RNF2 | seqs_with_libbarcode | pam | 202339 |
| NNN03m | RNF2 | seqs_with_target | proto | 430652 |
| NNN03m | RNF2 | seqs_with_target | pam | 222730 |
| NNN03m | RNF2 | seqs_passing_all_filters | proto | 325605 |
| NNN03m | RNF2 | seqs_passing_all_filters | pam | 191742 |
| NNN04m | RNF2 | seqs | proto | 180327 |
| NNN04m | RNF2 | seqs | pam | 222795 |
| NNN04m | RNF2 | seqs_with_primer | proto | 170052 |
| NNN04m | RNF2 | seqs_with_primer | pam | 213663 |
| NNN04m | RNF2 | seqs_with_umi | proto | 141928 |
| NNN04m | RNF2 | seqs_with_umi | pam | 199321 |
| NNN04m | RNF2 | seqs_with_libbarcode | proto | 122152 |
| NNN04m | RNF2 | seqs_with_libbarcode | pam | 151119 |
| NNN04m | RNF2 | seqs_with_target | proto | 150423 |
| NNN04m | RNF2 | seqs_with_target | pam | 166572 |
| NNN04m | RNF2 | seqs_passing_all_filters | proto | 113197 |
| NNN04m | RNF2 | seqs_passing_all_filters | pam | 143297 |
| NNN05m | RNF2 | seqs | proto | 366433 |
| NNN05m | RNF2 | seqs | pam | 298248 |
| NNN05m | RNF2 | seqs_with_primer | proto | 350485 |
| NNN05m | RNF2 | seqs_with_primer | pam | 285330 |
| NNN05m | RNF2 | seqs_with_umi | proto | 288418 |
| NNN05m | RNF2 | seqs_with_umi | pam | 267038 |
| NNN05m | RNF2 | seqs_with_libbarcode | proto | 245207 |
| NNN05m | RNF2 | seqs_with_libbarcode | pam | 197039 |
| NNN05m | RNF2 | seqs_with_target | proto | 304916 |
| NNN05m | RNF2 | seqs_with_target | pam | 216541 |
| NNN05m | RNF2 | seqs_passing_all_filters | proto | 232142 |
| NNN05m | RNF2 | seqs_passing_all_filters | pam | 186157 |
ONE-SEQ SCORE (replicate scatter plots)
All barcodes that were found in both compared datasets are plotted as scatterplots to evaluate the similarity in ONE-Seq score.
NOTE These include only barcodes that are in both samples.
############################################################
## CREATE SUMMARY DF
############################################################
r1_one_seq_score_plot_df <- r1_scores_df %>%
## retain only necessary cols, barcode will be identical between two datasets
select(barcode, oneseq_score, lab, target_site) %>%
## list the oneseq score under Karl and NIST-2 accordingly
pivot_wider(names_from = lab, values_from = oneseq_score, values_fill = NA) %>%
## drop any rows that contain NA because those were not found in one of the datasets
## comparing only those seqs in common between datasets
drop_na() %>%
group_by(target_site)
############################################################
## FUNCTION TO MAKE PLOTS
############################################################
## pass the above df and specify which cols to plot
make_oneseq_scatter_plot <- function(df, lab1, lab2){
## reduce df
plot_df <- df %>%
select(barcode, target_site, all_of(lab1), all_of(lab2))
## PLOT
plot_df %>%
ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) +
geom_point(alpha = 0.25) +
ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") +
facet_wrap(~target_site, scales = "free_x", ncol = 1) +
theme_bw() +
labs(x = lab1,
y =lab2,
title = "ONE-Seq Score of Common Barcodes")
}Replicate 1 vs. 2
make_oneseq_scatter_plot(r1_one_seq_score_plot_df, "NNN03m", "NNN04m")Replicate 2 vs. 3
make_oneseq_scatter_plot(r1_one_seq_score_plot_df, "NNN04m", "NNN05m")Replicate 1 vs. 3
make_oneseq_scatter_plot(r1_one_seq_score_plot_df, "NNN03m", "NNN05m")############################################################
## REMOVE OBJECT
############################################################
rm(r1_one_seq_score_plot_df)NORMALIZED COUNTS (replicate scatter plots)
Compare the normalized proto & pam counts that are found in the scores.tsv files.
NOTE only barcodes present in both samples are plotted.
## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
############################################################
## DF CURATION
############################################################
## PROTO
## EMX1, FANCF, RNF2
normalized_counts_proto_df <- r1_scores_df %>%
select(barcode, normalized_counts_proto, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = normalized_counts_proto, values_fill = NA)
## PAM
## EMX1, FACNF, RNF2
normalized_counts_pam_df <- r1_scores_df %>%
select(barcode, normalized_counts_pam, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = normalized_counts_pam, values_fill = NA)
############################################################
## FUNCTION
############################################################
########################## PROTO ##########################
make.proto.norm.plot <- function(df, lab1, lab2){
plot_df <- df %>%
select(barcode, target_site, all_of(lab1), all_of(lab2)) %>%
drop_na() %>%
group_by(target_site)
## plot
plot_df %>%
ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) +
geom_point(alpha = 0.25) +
ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = lab1,
y = lab2,
title = "Normalized Counts of Common Barcodes: Proto")
}
########################## PAM ##########################
make.pam.norm.plot <- function(df, lab1, lab2){
plot_df <- df %>%
select(barcode, target_site, all_of(lab1), all_of(lab2)) %>%
drop_na() %>%
group_by(target_site)
## plot
plot_df %>%
ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) +
geom_point(alpha = 0.25) +
ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = lab1,
y = lab2,
title = "Normalized Counts of Common Barcodes: PAM")
}Replicate 1 vs. 2
make.proto.norm.plot(normalized_counts_proto_df, "NNN03m", "NNN04m")make.pam.norm.plot(normalized_counts_pam_df, "NNN03m", "NNN04m")Replicate 2 vs. 3
make.proto.norm.plot(normalized_counts_proto_df, "NNN04m", "NNN05m")make.pam.norm.plot(normalized_counts_pam_df, "NNN04m", "NNN05m")Replicate 3 vs. 4
make.proto.norm.plot(normalized_counts_proto_df, "NNN03m", "NNN05m")make.pam.norm.plot(normalized_counts_pam_df, "NNN03m", "NNN05m")############################################################
## REMOVE OBJECTS
############################################################
rm(normalized_counts_proto_df)
rm(normalized_counts_pam_df)RAW COUNTS (replicate scatter plots)
Compare the raw proto & pam counts that are calculated from the scores.tsv files.
NOTE only barcodes present in both samples are plotted.
## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
############################################################
## DF CURATION
############################################################
## PROTO
## EMX1, FACNF, RNF2
proto_raw_read_count_df <- r1_scores_df %>%
select(barcode, proto_raw_read_count, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = proto_raw_read_count, values_fill = NA)
## PAM
## EMX1, FACNF, RNF2
pam_raw_read_count_df <- r1_scores_df %>%
select(barcode, pam_raw_read_count, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = pam_raw_read_count, values_fill = NA)
##########################################################
## FUNCTION
##########################################################
############################## PROTO ##############################
make.proto.raw.counts.plot <- function(df, lab1, lab2){
plot_df <- df %>%
select(barcode, target_site, lab1, lab2) %>%
drop_na() %>%
group_by(target_site)
## plot
plot_df %>%
ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) +
geom_point(alpha = 0.25) +
ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = lab1,
y = lab2,
title = "raw Counts of Common Barcodes: Proto")
}
############################## PAM ##############################
make.pam.raw.counts.plot <- function(df, lab1, lab2){
plot_df <- df %>%
select(barcode, target_site, lab1, lab2) %>%
drop_na() %>%
group_by(target_site)
## plot
plot_df %>%
ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) +
geom_point(alpha = 0.25) +
ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") +
facet_wrap(~target_site, ncol = 1) +
theme_bw() +
labs(x = lab1,
y = lab2,
title = "raw Counts of Common Barcodes: PAM")
}Replicate 1 vs. 2
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN03m", "NNN04m")make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN03m", "NNN04m")Replicate 2 vs. 3
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN05m", "NNN05m")make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN05m", "NNN05m")Replicate 1 vs. 3
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN03m", "NNN05m")make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN03m", "NNN05m")##########################################################
## REMOVE OBJECTS
##########################################################
rm(proto_raw_read_count_df)
rm(pam_raw_read_count_df)VENN BAR: PER TARGET BARCODE COMPARISON
How many barcodes are found among triplicates for each target site? NOTE no count values are taken into consideration.
########################################################################
# DF CURATION
########################################################################
r1_venn_barcodes_df <- r1_scores_df %>%
## DEFINED AT BEGINNING OF R1 SECTION
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
########################################################################
# PLOT
########################################################################
ggplotly(ggplot(data = r1_venn_barcodes_df) +
geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set),
# position = "fill"
position = "fill") +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Triplicate Combinations",
title = "R1 Replicate Barcodes") +
scale_fill_manual(values=cbPalette))########################################################################
# TABLE
########################################################################
(r1_venn_table_df <- r1_venn_barcodes_df %>%
group_by(target_site) %>%
count(lab_set) %>%
group_by(target_site) %>%
mutate(target_site_total = sum(n)) %>%
mutate(pct = round(n/target_site_total * 100, 0),
count_value = paste0(n, " (", pct, "%)")) %>%
select(-n, -pct) %>%
pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | target_site_total | NNN03m | NNN03m-NNN04m | NNN03m-NNN04m-NNN05m | NNN03m-NNN05m | NNN04m | NNN04m-NNN05m | NNN05m |
|---|---|---|---|---|---|---|---|---|
| EMX1 | 41662 | 4807 (12%) | 4881 (12%) | 9662 (23%) | 4097 (10%) | 6709 (16%) | 5769 (14%) | 5737 (14%) |
| FANCF | 22144 | 6650 (30%) | 3819 (17%) | 3872 (17%) | 2786 (13%) | 2328 (11%) | 897 (4%) | 1792 (8%) |
| RNF2 | 21867 | 3248 (15%) | 2288 (10%) | 4964 (23%) | 3979 (18%) | 1878 (9%) | 2140 (10%) | 3370 (15%) |
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT
########################################################################
rm(r1_venn_table_df)BARCODE CONCORDANCE TABLE
Table that breaks down the number of barcodes that are:
- Not concordant (present in only 1 replicate)
- Concordant between 2 replicates
- Concordant in all 3 replicates
NOTE This is based on at least 1 [calculated] raw read to be considered present in the sample. In the future we may want to increase the read threshold to be considered.
########################################################################
# FUNCTION
########################################################################
make_concordance_table <- function(df, pam_proto){
if(pam_proto == "proto"){
(df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
## include only those with reads of at least 1
filter(proto_raw_read_count > 0) %>%
select(-proto_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
group_by(target_site, lab_num) %>%
tally() %>%
mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
pivot_wider(names_from = lab_num,
values_from = n,
values_fill = NA) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed",
"responsive", html_font = "Cambria")))
} else if(pam_proto == "pam"){
(df %>%
select(barcode, target_site, lab, pam_raw_read_count) %>%
## include only those with reads of at least 1
filter(pam_raw_read_count > 0) %>%
select(-pam_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
group_by(target_site, lab_num) %>%
tally() %>%
mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
pivot_wider(names_from = lab_num,
values_from = n,
values_fill = NA) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed",
"responsive", html_font = "Cambria")))
} else {
message("Enter 'pam' or 'proto' for second arg")
}
}PROTO
make_concordance_table(r1_scores_df, "proto")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
| target_site | concordant_in_1 | concordant_in_2 | concordant_in_3 |
|---|---|---|---|
| EMX1 | 15331 | 5563 | 2984 |
| FANCF | 7910 | 1973 | 1175 |
| RNF2 | 8727 | 2909 | 968 |
PAM
make_concordance_table(r1_scores_df, "pam")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
| target_site | concordant_in_1 | concordant_in_2 | concordant_in_3 |
|---|---|---|---|
| EMX1 | 17842 | 11169 | 5484 |
| FANCF | 10170 | 5689 | 2731 |
| RNF2 | 9257 | 6340 | 2996 |
DENSITY PLOTS
Similar to the venn bar plots, visualize barcode overlap between samples with raw read counts plotted.
###########################################################################
####### FUNCTION
###########################################################################
cbPalette <- c("#000000",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#0072B2",
"#D55E00",
"#CC79A7",
"#999999",
"#20F03B",
"#FEC44F",
"#D95F0E",
"#756BB1",
"#FF8A33",
"#D7BE07",
"#A807D7",
"dodgerblue2",
"#E31A1C",
"green4",
"#6A3D9A",
"#FF7F00",
"black",
"gold1",
"skyblue2",
"#FB9A99",
"palegreen2",
"#CAB2D6",
"#FDBF6F",
"gray70",
"khaki2",
"maroon",
"orchid1",
"deeppink1",
"blue1",
"steelblue4",
"darkturquoise",
"green1",
"yellow4",
"yellow3",
"darkorange4",
"brown")
######################### PROTO
make.proto.density.plots <- function(df, lablist){
labs <- c(lablist)
density_plot_labset_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
proto_density_plot_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site, proto_raw_read_count) %>%
right_join(density_plot_labset_df) %>%
count(target_site, lab_set, proto_raw_read_count, name = "Occurrence")
options(scipen = 999)
plot <- ggplot(proto_density_plot_df) +
geom_histogram(aes(x = proto_raw_read_count, fill = lab_set), position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual") +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Proto Raw Read Counts") +
scale_fill_manual(values=cbPalette)
return(plot)
}
########################## PAM
make.pam.density.plots <- function(df, lablist){
labs <- c(lablist)
density_plot_labset_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
pam_density_plot_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site, pam_raw_read_count) %>%
right_join(density_plot_labset_df) %>%
count(target_site, lab_set, pam_raw_read_count, name = "Occurrence")
options(scipen = 999)
plot <- ggplot(pam_density_plot_df) +
geom_histogram(aes(x = pam_raw_read_count, fill = lab_set), position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual") +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "PAM Raw Read Counts") +
scale_fill_manual(values=cbPalette)
return(plot)
}PROTO
lab_list <- c("NNN03m", "NNN04m", "NNN05m")
ggplotly(make.proto.density.plots(r1_scores_df, lab_list))PAM
ggplotly(make.pam.density.plots(r1_scores_df, lab_list))DENSITY PLOT: COMBINED PROTO & PAM
For each barcode of each triplicate, take the lowest read count value between proto and pam.
###########################################################################
####### GATHER MIN PAM/PROTO READ VALUE FOR EACH TRIPLICATE
###########################################################################
NNN03m_counts_df <- r1_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count,
proto_raw_read_count) %>%
filter(lab == "NNN03m") %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count,
proto_raw_read_count,
na.rm=TRUE)) %>%
select(-pam_raw_read_count, -proto_raw_read_count)
NNN04m_counts_df <- r1_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count,
proto_raw_read_count) %>%
filter(lab == "NNN04m") %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count,
proto_raw_read_count,
na.rm=TRUE)) %>%
select(-pam_raw_read_count, -proto_raw_read_count)
NNN05m_counts_df <- r1_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count,
proto_raw_read_count) %>%
filter(lab == "NNN05m") %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count,
proto_raw_read_count,
na.rm=TRUE)) %>%
select(-pam_raw_read_count, -proto_raw_read_count)
###########################################################################
####### CAT EACH TRIPLICATE DF INTO 1 COHESIVE DF
###########################################################################
r1_combined_proto_pam_df <- plyr::rbind.fill(NNN03m_counts_df,
NNN04m_counts_df,
NNN05m_counts_df)
lab_list <- c("NNN03m", "NNN04m", "NNN05m")
###########################################################################
####### FUNCTION
###########################################################################
make.combined.density.plots <- function(df, lablist){
labs <- c(lablist)
density_plot_labset_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
proto_density_plot_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site, min_raw_pam_proto_count) %>%
right_join(density_plot_labset_df) %>%
count(target_site, lab_set, min_raw_pam_proto_count, name = "Occurrence")
options(scipen = 999)
plot <- ggplot(proto_density_plot_df) +
geom_histogram(aes(x = min_raw_pam_proto_count, fill = lab_set),
position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual") +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Combined Proto & PAM Raw Read Counts") +
scale_fill_manual(values=cbPalette)
return(plot)
}make.combined.density.plots(r1_combined_proto_pam_df, lab_list)R2 REPLICATE COMPARISONS
Create a df with R2 data only.
########################################################################
# DEFINE LABS
########################################################################
## create r2 df: NNN02, NNN03, NNN04
labs <- c("NNN06m", "NNN07m", "NNN08m")
########################################################################
# CREATE R2 SCORES DF
########################################################################
r2_scores_df <- scores_df %>%
filter(lab %in% labs)
########################################################################
# CREATE R2 QC DF
########################################################################
r2_qc_df <- qc_df %>%
filter(lab %in% labs)TOTAL NUMBER OF BARCODES
The total number of barcodes with at least 1 read for each triplicate & target site.
########################################################################
# CREATE MINIMAL DF THAT DESCRIBES TOTAL BARCODES
########################################################################
total_barcodes_plot_df <- r2_scores_df %>%
group_by(target_site, lab) %>%
count(target_site, lab, name = "total_barcodes") %>%
arrange(target_site, lab)
########################################################################
# PLOT
########################################################################
ggplotly(ggplot(total_barcodes_plot_df) +
geom_bar(aes(x = target_site, y = total_barcodes, fill = lab),
# dodge prserves vertical position, single groups bars of same metric together
# stat = identity skips aggregation of values - I provide values
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) + # add commas to axis label numbers
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Target Site", y = "Total Number of Barcodes",
fill = "Lab", main = "Total Number of Barcodes") +
scale_fill_manual(values=cbPalette))########################################################################
# TABLE
########################################################################
(total_barcodes_plot_df %>%
pivot_wider(names_from = "lab",
values_from = "total_barcodes",
values_fill = 0) %>%
rowwise() %>%
mutate(AVG = mean(c(NNN06m, NNN07m, NNN08m)),
SD = sd(c(NNN06m, NNN07m, NNN08m)),
CV = SD / AVG) %>%
arrange(desc(CV)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | NNN06m | NNN07m | NNN08m | AVG | SD | CV |
|---|---|---|---|---|---|---|
| EMX1 | 29895 | 25130 | 24803 | 26609.33 | 2850.1643 | 0.1071114 |
| FANCF | 13038 | 10535 | 11786 | 11786.33 | 1251.5000 | 0.1061823 |
| RNF2 | 16074 | 15747 | 14851 | 15557.33 | 633.1764 | 0.0406995 |
########################################################################
# REMOVE ENV VARS
########################################################################
rm(total_barcodes_plot_df)QC READ COUNTS
From the ONE-Seq QC_scores.csv file: The number of reads (proto & pam) in each QC category, plotted for each target site.
EMX1
## FUNCTIONS ARE IN R1 SECTION
make_qc_reads_plot(r2_qc_df, "EMX1")make_qc_table(r2_qc_df, "EMX1")## Warning: Unknown or uninitialised column: `labs`.
| lab | target_site | var | proto_pam | read_count |
|---|---|---|---|---|
| NNN06m | EMX1 | seqs | proto | 617312 |
| NNN06m | EMX1 | seqs | pam | 775065 |
| NNN06m | EMX1 | seqs_with_primer | proto | 589315 |
| NNN06m | EMX1 | seqs_with_primer | pam | 742960 |
| NNN06m | EMX1 | seqs_with_umi | proto | 492883 |
| NNN06m | EMX1 | seqs_with_umi | pam | 703540 |
| NNN06m | EMX1 | seqs_with_libbarcode | proto | 440147 |
| NNN06m | EMX1 | seqs_with_libbarcode | pam | 539561 |
| NNN06m | EMX1 | seqs_with_target | proto | 533292 |
| NNN06m | EMX1 | seqs_with_target | pam | 586224 |
| NNN06m | EMX1 | seqs_passing_all_filters | proto | 415366 |
| NNN06m | EMX1 | seqs_passing_all_filters | pam | 510987 |
| NNN07m | EMX1 | seqs | proto | 669362 |
| NNN07m | EMX1 | seqs | pam | 678091 |
| NNN07m | EMX1 | seqs_with_primer | proto | 642338 |
| NNN07m | EMX1 | seqs_with_primer | pam | 652737 |
| NNN07m | EMX1 | seqs_with_umi | proto | 541210 |
| NNN07m | EMX1 | seqs_with_umi | pam | 618541 |
| NNN07m | EMX1 | seqs_with_libbarcode | proto | 472278 |
| NNN07m | EMX1 | seqs_with_libbarcode | pam | 448568 |
| NNN07m | EMX1 | seqs_with_target | proto | 570877 |
| NNN07m | EMX1 | seqs_with_target | pam | 488057 |
| NNN07m | EMX1 | seqs_passing_all_filters | proto | 448146 |
| NNN07m | EMX1 | seqs_passing_all_filters | pam | 427144 |
| NNN08m | EMX1 | seqs | proto | 622705 |
| NNN08m | EMX1 | seqs | pam | 629810 |
| NNN08m | EMX1 | seqs_with_primer | proto | 596404 |
| NNN08m | EMX1 | seqs_with_primer | pam | 606283 |
| NNN08m | EMX1 | seqs_with_umi | proto | 500464 |
| NNN08m | EMX1 | seqs_with_umi | pam | 573867 |
| NNN08m | EMX1 | seqs_with_libbarcode | proto | 435471 |
| NNN08m | EMX1 | seqs_with_libbarcode | pam | 413872 |
| NNN08m | EMX1 | seqs_with_target | proto | 527930 |
| NNN08m | EMX1 | seqs_with_target | pam | 451407 |
| NNN08m | EMX1 | seqs_passing_all_filters | proto | 412352 |
| NNN08m | EMX1 | seqs_passing_all_filters | pam | 394733 |
FANCF
## FUNCTIONS ARE IN R1 SECTION
make_qc_reads_plot(r2_qc_df, "FANCF")make_qc_table(r2_qc_df, "FANCF")## Warning: Unknown or uninitialised column: `labs`.
| lab | target_site | var | proto_pam | read_count |
|---|---|---|---|---|
| NNN06m | FANCF | seqs | proto | 528619 |
| NNN06m | FANCF | seqs | pam | 584983 |
| NNN06m | FANCF | seqs_with_primer | proto | 506657 |
| NNN06m | FANCF | seqs_with_primer | pam | 563253 |
| NNN06m | FANCF | seqs_with_umi | proto | 435878 |
| NNN06m | FANCF | seqs_with_umi | pam | 535807 |
| NNN06m | FANCF | seqs_with_libbarcode | proto | 393694 |
| NNN06m | FANCF | seqs_with_libbarcode | pam | 421416 |
| NNN06m | FANCF | seqs_with_target | proto | 472142 |
| NNN06m | FANCF | seqs_with_target | pam | 457092 |
| NNN06m | FANCF | seqs_passing_all_filters | proto | 373918 |
| NNN06m | FANCF | seqs_passing_all_filters | pam | 402064 |
| NNN07m | FANCF | seqs | proto | 361966 |
| NNN07m | FANCF | seqs | pam | 432023 |
| NNN07m | FANCF | seqs_with_primer | proto | 346021 |
| NNN07m | FANCF | seqs_with_primer | pam | 415374 |
| NNN07m | FANCF | seqs_with_umi | proto | 297099 |
| NNN07m | FANCF | seqs_with_umi | pam | 395643 |
| NNN07m | FANCF | seqs_with_libbarcode | proto | 266494 |
| NNN07m | FANCF | seqs_with_libbarcode | pam | 308255 |
| NNN07m | FANCF | seqs_with_target | proto | 319474 |
| NNN07m | FANCF | seqs_with_target | pam | 333736 |
| NNN07m | FANCF | seqs_passing_all_filters | proto | 252081 |
| NNN07m | FANCF | seqs_passing_all_filters | pam | 293366 |
| NNN08m | FANCF | seqs | proto | 463516 |
| NNN08m | FANCF | seqs | pam | 467310 |
| NNN08m | FANCF | seqs_with_primer | proto | 445307 |
| NNN08m | FANCF | seqs_with_primer | pam | 449773 |
| NNN08m | FANCF | seqs_with_umi | proto | 385596 |
| NNN08m | FANCF | seqs_with_umi | pam | 428270 |
| NNN08m | FANCF | seqs_with_libbarcode | proto | 347129 |
| NNN08m | FANCF | seqs_with_libbarcode | pam | 339337 |
| NNN08m | FANCF | seqs_with_target | proto | 412601 |
| NNN08m | FANCF | seqs_with_target | pam | 367216 |
| NNN08m | FANCF | seqs_passing_all_filters | proto | 330488 |
| NNN08m | FANCF | seqs_passing_all_filters | pam | 323445 |
RNF2
## FUNCTIONS ARE IN R1 SECTION
make_qc_reads_plot(r2_qc_df, "RNF2")make_qc_table(r2_qc_df, "RNF2")## Warning: Unknown or uninitialised column: `labs`.
| lab | target_site | var | proto_pam | read_count |
|---|---|---|---|---|
| NNN06m | RNF2 | seqs | proto | 689586 |
| NNN06m | RNF2 | seqs | pam | 443239 |
| NNN06m | RNF2 | seqs_with_primer | proto | 662104 |
| NNN06m | RNF2 | seqs_with_primer | pam | 427084 |
| NNN06m | RNF2 | seqs_with_umi | proto | 556713 |
| NNN06m | RNF2 | seqs_with_umi | pam | 404090 |
| NNN06m | RNF2 | seqs_with_libbarcode | proto | 460917 |
| NNN06m | RNF2 | seqs_with_libbarcode | pam | 293395 |
| NNN06m | RNF2 | seqs_with_target | proto | 564109 |
| NNN06m | RNF2 | seqs_with_target | pam | 319872 |
| NNN06m | RNF2 | seqs_passing_all_filters | proto | 437965 |
| NNN06m | RNF2 | seqs_passing_all_filters | pam | 279851 |
| NNN07m | RNF2 | seqs | proto | 312944 |
| NNN07m | RNF2 | seqs | pam | 286467 |
| NNN07m | RNF2 | seqs_with_primer | proto | 299683 |
| NNN07m | RNF2 | seqs_with_primer | pam | 276410 |
| NNN07m | RNF2 | seqs_with_umi | proto | 249380 |
| NNN07m | RNF2 | seqs_with_umi | pam | 261450 |
| NNN07m | RNF2 | seqs_with_libbarcode | proto | 216234 |
| NNN07m | RNF2 | seqs_with_libbarcode | pam | 203807 |
| NNN07m | RNF2 | seqs_with_target | proto | 265057 |
| NNN07m | RNF2 | seqs_with_target | pam | 221851 |
| NNN07m | RNF2 | seqs_passing_all_filters | proto | 204899 |
| NNN07m | RNF2 | seqs_passing_all_filters | pam | 194637 |
| NNN08m | RNF2 | seqs | proto | 317635 |
| NNN08m | RNF2 | seqs | pam | 341128 |
| NNN08m | RNF2 | seqs_with_primer | proto | 304781 |
| NNN08m | RNF2 | seqs_with_primer | pam | 328764 |
| NNN08m | RNF2 | seqs_with_umi | proto | 256586 |
| NNN08m | RNF2 | seqs_with_umi | pam | 311342 |
| NNN08m | RNF2 | seqs_with_libbarcode | proto | 212667 |
| NNN08m | RNF2 | seqs_with_libbarcode | pam | 233009 |
| NNN08m | RNF2 | seqs_with_target | proto | 260487 |
| NNN08m | RNF2 | seqs_with_target | pam | 254010 |
| NNN08m | RNF2 | seqs_passing_all_filters | proto | 201734 |
| NNN08m | RNF2 | seqs_passing_all_filters | pam | 222341 |
ONE-SEQ SCORE (replicate scatter plots)
All barcodes that were found in both compared datasets are plotted as scatterplots to evaluate the similarity in ONE-Seq score.
NOTE These include only barcodes that are in both samples.
############################################################
## CREATE SUMMARY DF
############################################################
r2_one_seq_score_plot_df <- r2_scores_df %>%
## retain only necessary cols, barcode will be identical between two datasets
select(barcode, oneseq_score, lab, target_site) %>%
## list the oneseq score under Karl and NIST-2 accordingly
pivot_wider(names_from = lab, values_from = oneseq_score, values_fill = NA) %>%
## drop any rows that contain NA because those were not found in one of the datasets
## comparing only those seqs in common between datasets
drop_na() %>%
group_by(target_site)Replicate 1 vs. 2
## FUNCTION IN R1 SECTION
make_oneseq_scatter_plot(r2_one_seq_score_plot_df, "NNN06m", "NNN07m")Replicate 2 vs. 3
## FUNCTION IN R1 SECTION
make_oneseq_scatter_plot(r2_one_seq_score_plot_df, "NNN07m", "NNN08m")Replicate 1 vs. 3
## FUNCTION IN R1 SECTION
make_oneseq_scatter_plot(r2_one_seq_score_plot_df, "NNN06m", "NNN08m")############################################################
## REMOVE OBJECT
############################################################
rm(r2_one_seq_score_plot_df)NORMALIZED COUNTS (replicate scatter plots)
Compare the normalized proto & pam counts that are found in the scores.tsv files.
NOTE only barcodes present in both samples are plotted.
## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
############################################################
## DF CURATION
############################################################
################################### PROTO ##################
## EMX1, FANCF, RNF2
normalized_counts_proto_df <- r2_scores_df %>%
select(barcode, normalized_counts_proto, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = normalized_counts_proto, values_fill = NA)
################################### PAM ####################
## EMX1, FACNF, RNF2
normalized_counts_pam_df <- r2_scores_df %>%
select(barcode, normalized_counts_pam, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = normalized_counts_pam, values_fill = NA)Replicate 1 vs. 2
## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(normalized_counts_proto_df, "NNN06m", "NNN07m")make.pam.norm.plot(normalized_counts_pam_df, "NNN06m", "NNN07m")Replicate 2 vs. 3
## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(normalized_counts_proto_df, "NNN07m", "NNN08m")make.pam.norm.plot(normalized_counts_pam_df, "NNN07m", "NNN08m")Replicate 3 vs. 4
## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(normalized_counts_proto_df, "NNN06m", "NNN08m")make.pam.norm.plot(normalized_counts_pam_df, "NNN06m", "NNN08m")############################################################
## REMOVE OBJECTS
############################################################
rm(normalized_counts_proto_df)
rm(normalized_counts_pam_df)RAW COUNTS (replicate scatter plots)
Compare the raw proto & pam counts that are calculated from the scores.tsv files.
NOTE only barcodes present in both samples are plotted.
## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
#############################################
## DF CURATION
#############################################
######################### PROTO
## EMX1, FACNF, RNF2
proto_raw_read_count_df <- r2_scores_df %>%
select(barcode, proto_raw_read_count, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = proto_raw_read_count, values_fill = NA)
######################### PAM
## EMX1, FACNF, RNF2
pam_raw_read_count_df <- r2_scores_df %>%
select(barcode, pam_raw_read_count, lab, target_site) %>%
pivot_wider(names_from = lab, values_from = pam_raw_read_count, values_fill = NA)Replicate 1 vs. 2
## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN06m", "NNN07m")make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN06m", "NNN07m")Replicate 2 vs. 3
## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN07m", "NNN08m")make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN07m", "NNN08m")Replicate 1 vs. 3
## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN06m", "NNN08m")make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN06m", "NNN08m")##########################################################
## REMOVE OBJECTS
##########################################################
rm(proto_raw_read_count_df)
rm(pam_raw_read_count_df)VENN BAR: PER TARGET BARCODE COMPARISON
How many barcodes are found among triplicates for each target site? NOTE no count values are taken into consideration.
##########################################################
## DF CURATION
##########################################################
r2_venn_barcodes_df <- r2_scores_df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
##########################################################
## PLOT
##########################################################
ggplotly(ggplot(data = r2_venn_barcodes_df) +
geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set),
# position = "fill"
position = "fill") +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Triplicate Combinations",
title = "R2 Replicate Barcodes") +
scale_fill_manual(values=cbPalette))##########################################################
## TABLE
##########################################################
(r2_venn_table_df <- r2_venn_barcodes_df %>%
group_by(target_site) %>%
count(lab_set) %>%
group_by(target_site) %>%
mutate(target_site_total = sum(n)) %>%
mutate(pct = round(n/target_site_total * 100, 0),
count_value = paste0(n, " (", pct, "%)")) %>%
select(-n, -pct) %>%
pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | target_site_total | NNN06m | NNN06m-NNN07m | NNN06m-NNN07m-NNN08m | NNN06m-NNN08m | NNN07m | NNN07m-NNN08m | NNN08m |
|---|---|---|---|---|---|---|---|---|
| EMX1 | 42608 | 7207 (17%) | 6004 (14%) | 10831 (25%) | 5853 (14%) | 4594 (11%) | 3701 (9%) | 4418 (10%) |
| FANCF | 20772 | 4155 (20%) | 2145 (10%) | 3975 (19%) | 2763 (13%) | 2686 (13%) | 1729 (8%) | 3319 (16%) |
| RNF2 | 23262 | 2575 (11%) | 3379 (15%) | 7255 (31%) | 2865 (12%) | 2457 (11%) | 2656 (11%) | 2075 (9%) |
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT
########################################################################
rm(r2_venn_table_df)BARCODE CONCORDANCE TABLE
Table that breaks down the number of barcodes that are:
- Not concordant (present in only 1 replicate)
- Concordant between 2 replicates
- Concordant in all 3 replicates
NOTE This is based on at least 1 [calculated] raw read to be considered present in the sample. In the future we may want to increase the read threshold to be considered.
PROTO
## FUNCTION DEFINED IN R1 SECTION
make_concordance_table(r2_scores_df, "proto")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
| target_site | concordant_in_1 | concordant_in_2 | concordant_in_3 |
|---|---|---|---|
| EMX1 | 14110 | 4514 | 2611 |
| FANCF | 7645 | 2244 | 1470 |
| RNF2 | 8721 | 3148 | 1236 |
PAM
## FUNCTION DEFINED IN R1 SECTION
make_concordance_table(r2_scores_df, "pam")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
| target_site | concordant_in_1 | concordant_in_2 | concordant_in_3 |
|---|---|---|---|
| EMX1 | 17296 | 12897 | 7367 |
| FANCF | 9438 | 4641 | 2525 |
| RNF2 | 8269 | 7617 | 4837 |
DENSITY PLOTS
Similar to the venn bar plots, visualize barcode overlap between samples with raw read counts plotted.
PROTO
## FUNCTION DEFINED IN R1 SECTION
lab_list <- c("NNN06m", "NNN07m", "NNN08m")
ggplotly(make.proto.density.plots(r2_scores_df, lab_list))PAM
## FUNCTION DEFINED IN R1 SECTION
ggplotly(make.pam.density.plots(r2_scores_df, lab_list))DENSITY PLOT: COMBINED PROTO & PAM
For each barcode of each triplicate, take the lowest read count value between proto and pam.
###########################################################################
####### GATHER MIN PAM/PROTO READ VALUE FOR EACH TRIPLICATE
###########################################################################
NNN06m_counts_df <- r2_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count,
proto_raw_read_count) %>%
filter(lab == "NNN06m") %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count,
proto_raw_read_count,
na.rm=TRUE)) %>%
select(-pam_raw_read_count, -proto_raw_read_count)
NNN07m_counts_df <- r2_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count,
proto_raw_read_count) %>%
filter(lab == "NNN07m") %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count,
proto_raw_read_count,
na.rm=TRUE)) %>%
select(-pam_raw_read_count, -proto_raw_read_count)
NNN08m_counts_df <- r2_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count,
proto_raw_read_count) %>%
filter(lab == "NNN08m") %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count,
proto_raw_read_count,
na.rm=TRUE)) %>%
select(-pam_raw_read_count, -proto_raw_read_count)
###########################################################################
####### CAT EACH TRIPLICATE DF INTO 1 COHESIVE DF
###########################################################################
r2_combined_proto_pam_df <- plyr::rbind.fill(NNN06m_counts_df,
NNN07m_counts_df,
NNN08m_counts_df)
lab_list <- c("NNN06m", "NNN07m", "NNN08m")
###########################################################################
####### FUNCTION
###########################################################################
make.combined.density.plots <- function(df, lablist){
labs <- c(lablist)
density_plot_labset_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
proto_density_plot_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site, min_raw_pam_proto_count) %>%
right_join(density_plot_labset_df) %>%
count(target_site, lab_set, min_raw_pam_proto_count, name = "Occurrence")
options(scipen = 999)
plot <- ggplot(proto_density_plot_df) +
geom_histogram(aes(x = min_raw_pam_proto_count, fill = lab_set),
position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual") +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Combined Proto & PAM Raw Read Counts") +
scale_fill_manual(values=cbPalette)
return(plot)
}make.combined.density.plots(r2_combined_proto_pam_df, lab_list)R1 vs. R2
For each set of triplicates (R1 and R2) take the lowest non-zero value among all triplicates for each barcode. If there is at least 1 read count among any of the triplicates, that value is recorded.
Example data frame showing how selection occurs
Concatenate triplicates for each target site so we can compare R1 to R2 as a whole.
## create r2 df: NNN02, NNN03, NNN04
labs <- c("NNN03m", "NNN04m", "NNN05m", "NNN06m", "NNN07m", "NNN08m")
##########################################################
########################## R1 ###########################
##########################################################
############### CREATE DF FOR EACH CATEGORY ##############
####### NORMALIZED PAM
r1_norm_pam_df <- r1_scores_df %>%
select(barcode, normalized_counts_pam, target_site) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = normalized_counts_pam,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_norm_pam_count = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
####### NORMALIZED PROTO
r1_norm_proto_df <- r1_scores_df %>%
select(barcode, normalized_counts_proto, target_site) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = normalized_counts_proto,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_norm_proto_count = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
####### PAM ON TARGET
r1_pam_target_df <- r1_scores_df %>%
select(barcode, target_site, pam_on_target_reads) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = pam_on_target_reads,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_pam_on_target_read = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
####### PROTO ON TARGET
r1_proto_target_df <- r1_scores_df %>%
select(barcode, target_site, proto_on_target_reads) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = proto_on_target_reads,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_proto_on_target_read = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
####### RAW PAM
r1_raw_pam_df <- r1_scores_df %>%
select(barcode, target_site, pam_raw_read_count) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = pam_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_pam_raw_read_count = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
####### RAW PROTO
r1_raw_proto_df <- r1_scores_df %>%
select(barcode, target_site, proto_raw_read_count) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = proto_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_proto_raw_read_count = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
####### ONESEQ SCORE
r1_oneseq_df <- r1_scores_df %>%
select(barcode, oneseq_score, target_site) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = oneseq_score,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_min_oneseq_score = pmin(NNN03m, NNN04m, NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)
############### COMBINE DFS ##############
r1_combined_df <- r1_norm_pam_df %>%
left_join(r1_norm_proto_df) %>%
left_join(r1_pam_target_df) %>%
left_join(r1_proto_target_df) %>%
left_join(r1_raw_pam_df) %>%
left_join(r1_raw_proto_df) %>%
left_join(r1_oneseq_df) %>%
mutate(dataset = "R1") %>%
rename(min_norm_pam_count = R1_min_norm_pam_count,
min_norm_proto_count = R1_min_norm_proto_count,
min_pam_on_target_read = R1_min_pam_on_target_read,
min_proto_on_target_read = R1_min_proto_on_target_read,
min_oneseq_score = R1_min_oneseq_score,
min_pam_raw_read_count = R1_min_pam_raw_read_count,
min_proto_raw_read_count = R1_min_proto_raw_read_count)
##########################################################
########################## R1 ###########################
##########################################################
############### CREATE DF FOR EACH CATEGORY ##############
####### NORMALIZED PAM
r2_norm_pam_df <- r2_scores_df %>%
select(barcode, normalized_counts_pam, target_site) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = normalized_counts_pam,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_norm_pam_count = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
####### NORMALIZED PROTO
r2_norm_proto_df <- r2_scores_df %>%
select(barcode, normalized_counts_proto, target_site) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = normalized_counts_proto,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_norm_proto_count = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
####### ON-TARGET PAM
r2_pam_target_df <- r2_scores_df %>%
select(barcode, target_site, pam_on_target_reads) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = pam_on_target_reads,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_pam_on_target_read = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
####### ON-TARGET PROTO
r2_proto_target_df <- r2_scores_df %>%
select(barcode, target_site, proto_on_target_reads) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = proto_on_target_reads,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_proto_on_target_read = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
####### RAW PAM
r2_raw_pam_df <- r2_scores_df %>%
select(barcode, target_site, pam_raw_read_count) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = pam_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_pam_raw_read_count = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
####### RAW PROTO
r2_raw_proto_df <- r2_scores_df %>%
select(barcode, target_site, proto_raw_read_count) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = proto_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_proto_raw_read_count = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
####### ONESEQ SCORE
r2_oneseq_df <- r2_scores_df %>%
select(barcode, oneseq_score, target_site) %>%
group_by(barcode, target_site) %>%
pivot_wider(names_from = lab,
values_from = oneseq_score,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(r2_min_oneseq_score = pmin(NNN06m, NNN07m, NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m)
############### COMBINE DFS ##############
r2_combined_df <- r2_norm_pam_df %>%
left_join(r2_norm_proto_df) %>%
left_join(r2_pam_target_df) %>%
left_join(r2_proto_target_df) %>%
left_join(r2_raw_pam_df) %>%
left_join(r2_raw_proto_df) %>%
left_join(r2_oneseq_df) %>%
mutate(dataset = "R2") %>%
rename(min_norm_pam_count = r2_min_norm_pam_count,
min_norm_proto_count = r2_min_norm_proto_count,
min_pam_on_target_read = r2_min_pam_on_target_read,
min_proto_on_target_read = r2_min_proto_on_target_read,
min_oneseq_score = r2_min_oneseq_score,
min_pam_raw_read_count = r2_min_pam_raw_read_count,
min_proto_raw_read_count = r2_min_proto_raw_read_count)
##########################################################
## CREATE A COMBINED R1/R2 DF ##
##########################################################
r1_r2_scores_df <- plyr::rbind.fill(r1_combined_df, r2_combined_df)
########################################################
## REMOVE UNNECESSARY ENV VARS ##
########################################################
rm(r1_norm_pam_df)
rm(r1_norm_proto_df)
rm(r1_pam_target_df)
rm(r1_proto_target_df)
rm(r1_raw_pam_df)
rm(r1_raw_proto_df)
rm(r1_oneseq_df)
rm(r2_norm_pam_df)
rm(r2_norm_proto_df)
rm(r2_pam_target_df)
rm(r2_proto_target_df)
rm(r2_raw_pam_df)
rm(r2_raw_proto_df)
rm(r2_oneseq_df)TOTAL NUMBER OF BARCODES
The total number of barcodes with at least 1 read for each sample & target site. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.
######################################################################
## curate summary df to plot number of off-target sits per target site & lab
r1_total_off_target_sites_plot_df <- r1_combined_df %>%
group_by(target_site) %>%
count(target_site, name = "r1_total_off_target_sites") %>%
arrange(target_site)
r2_total_off_target_sites_plot_df <- r2_combined_df %>%
group_by(target_site) %>%
count(target_site, name = "r2_total_off_target_sites") %>%
arrange(target_site)
r1_r2_total_off_target_sites_df <- r1_total_off_target_sites_plot_df %>%
left_join(r2_total_off_target_sites_plot_df) %>%
pivot_longer(cols = c("r1_total_off_target_sites",
"r2_total_off_target_sites"),
names_to = "replicate", values_to = "value")%>%
mutate(replicate = str_remove(replicate, "_total_off_target_sites")) %>%
mutate(replicate = str_replace(replicate, "r1", "R1"),
replicate = str_replace(replicate, "r2", "R2"))
##########
## plot ##
##########
ggplotly(ggplot(r1_r2_total_off_target_sites_df) +
geom_bar(aes(x = target_site, y = value, fill = replicate),
# dodge prserves vertical position, single groups bars of same metric together
# stat = identity skips aggregation of values - I provide values
position = position_dodge(preserve = "single"), stat = "identity") +
scale_y_continuous(labels = comma) + # add commas to axis label numbers
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "Target Site", y = "Total Number of Sites",
fill = "Lab", main = "Total Number of Barcodes") +
scale_fill_manual(values=cbPalette))(r1_r2_total_off_target_sites_df %>%
rename(total_sites = value) %>%
pivot_wider(names_from = "replicate",
values_from = "total_sites",
values_fill = 0) %>%
rowwise() %>%
mutate(AVG = mean(c(R1, R2)),
SD = sd(c(R1, R2)),
CV = SD / AVG) %>%
arrange(desc(CV)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | R1 | R2 | AVG | SD | CV |
|---|---|---|---|---|---|
| FANCF | 22144 | 20772 | 21458.0 | 970.1505 | 0.0452116 |
| RNF2 | 21867 | 23262 | 22564.5 | 986.4140 | 0.0437153 |
| EMX1 | 41662 | 42608 | 42135.0 | 668.9230 | 0.0158757 |
########################################################
## REMOVE UNNECESSARY ENV VARS ##
########################################################
rm(r1_total_off_target_sites_plot_df)
rm(r2_total_off_target_sites_plot_df)QC READ COUNTS
From the ONE-Seq QC_scores.csv file: The number of reads (proto & pam) in each QC category. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.
########################################################
## CURATE R1 DF
########################################################
r1_proto_qc_df <- r1_qc_df %>%
select(-pam) %>%
pivot_wider(names_from = lab,
values_from = proto,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_proto_value = pmin(NNN03m, NNN04m, NNN05m, na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m) %>%
mutate(lab = "R1") %>%
rename(proto = R1_proto_value)
r1_pam_qc_df <- r1_qc_df %>%
select(-proto) %>%
pivot_wider(names_from = lab,
values_from = pam,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R1_pam_value = pmin(NNN03m, NNN04m, NNN05m, na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m)%>%
mutate(lab = "R1") %>%
rename(pam = R1_pam_value)
r1_combined_qc_df <- r1_proto_qc_df %>%
left_join(r1_pam_qc_df)## Joining, by = c("target_site", "var", "lab")
########################################################
## CURATE R2 DF
########################################################
r2_proto_qc_df <- r2_qc_df %>%
select(-pam) %>%
pivot_wider(names_from = lab,
values_from = proto,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R2_proto_value = pmin(NNN06m, NNN07m, NNN08m, na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m) %>%
mutate(lab = "R2") %>%
rename(proto = R2_proto_value)
r2_pam_qc_df <- r2_qc_df %>%
select(-proto) %>%
pivot_wider(names_from = lab,
values_from = pam,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(R2_pam_value = pmin(NNN06m, NNN07m, NNN08m, na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m) %>%
mutate(lab = "R2") %>%
rename(pam = R2_pam_value)
r2_combined_qc_df <- r2_proto_qc_df %>%
left_join(r2_pam_qc_df) ## Joining, by = c("target_site", "var", "lab")
########################################################
## JOIN R1 R2 DFS
########################################################
r1_r2_qc_df <- plyr::rbind.fill(r1_combined_qc_df, r2_combined_qc_df)
########################################################
## REMOVE UNNECESSARY ENV VARS ##
########################################################
rm(r1_proto_qc_df)
rm(r1_pam_qc_df)
rm(r2_proto_qc_df)
rm(r2_pam_qc_df)
rm(r1_combined_qc_df)
rm(r2_combined_qc_df)EMX1
make_qc_reads_plot(r1_r2_qc_df, "EMX1")make_qc_table(r1_r2_qc_df, "EMX1")| target_site | var | lab | proto_pam | read_count |
|---|---|---|---|---|
| EMX1 | seqs | R1 | proto | 489072 |
| EMX1 | seqs | R1 | pam | 437017 |
| EMX1 | seqs_with_primer | R1 | proto | 466871 |
| EMX1 | seqs_with_primer | R1 | pam | 418686 |
| EMX1 | seqs_with_umi | R1 | proto | 386126 |
| EMX1 | seqs_with_umi | R1 | pam | 392840 |
| EMX1 | seqs_with_libbarcode | R1 | proto | 343483 |
| EMX1 | seqs_with_libbarcode | R1 | pam | 286664 |
| EMX1 | seqs_with_target | R1 | proto | 420164 |
| EMX1 | seqs_with_target | R1 | pam | 314442 |
| EMX1 | seqs_passing_all_filters | R1 | proto | 324280 |
| EMX1 | seqs_passing_all_filters | R1 | pam | 271902 |
| EMX1 | seqs | R2 | proto | 617312 |
| EMX1 | seqs | R2 | pam | 629810 |
| EMX1 | seqs_with_primer | R2 | proto | 589315 |
| EMX1 | seqs_with_primer | R2 | pam | 606283 |
| EMX1 | seqs_with_umi | R2 | proto | 492883 |
| EMX1 | seqs_with_umi | R2 | pam | 573867 |
| EMX1 | seqs_with_libbarcode | R2 | proto | 435471 |
| EMX1 | seqs_with_libbarcode | R2 | pam | 413872 |
| EMX1 | seqs_with_target | R2 | proto | 527930 |
| EMX1 | seqs_with_target | R2 | pam | 451407 |
| EMX1 | seqs_passing_all_filters | R2 | proto | 412352 |
| EMX1 | seqs_passing_all_filters | R2 | pam | 394733 |
FANCF
make_qc_reads_plot(r1_r2_qc_df, "FANCF")make_qc_table(r1_r2_qc_df, "FANCF")| target_site | var | lab | proto_pam | read_count |
|---|---|---|---|---|
| FANCF | seqs | R1 | proto | 241539 |
| FANCF | seqs | R1 | pam | 372237 |
| FANCF | seqs_with_primer | R1 | proto | 228055 |
| FANCF | seqs_with_primer | R1 | pam | 355724 |
| FANCF | seqs_with_umi | R1 | proto | 191388 |
| FANCF | seqs_with_umi | R1 | pam | 335440 |
| FANCF | seqs_with_libbarcode | R1 | proto | 172498 |
| FANCF | seqs_with_libbarcode | R1 | pam | 254841 |
| FANCF | seqs_with_target | R1 | proto | 210319 |
| FANCF | seqs_with_target | R1 | pam | 277710 |
| FANCF | seqs_passing_all_filters | R1 | proto | 160805 |
| FANCF | seqs_passing_all_filters | R1 | pam | 240830 |
| FANCF | seqs | R2 | proto | 361966 |
| FANCF | seqs | R2 | pam | 432023 |
| FANCF | seqs_with_primer | R2 | proto | 346021 |
| FANCF | seqs_with_primer | R2 | pam | 415374 |
| FANCF | seqs_with_umi | R2 | proto | 297099 |
| FANCF | seqs_with_umi | R2 | pam | 395643 |
| FANCF | seqs_with_libbarcode | R2 | proto | 266494 |
| FANCF | seqs_with_libbarcode | R2 | pam | 308255 |
| FANCF | seqs_with_target | R2 | proto | 319474 |
| FANCF | seqs_with_target | R2 | pam | 333736 |
| FANCF | seqs_passing_all_filters | R2 | proto | 252081 |
| FANCF | seqs_passing_all_filters | R2 | pam | 293366 |
RNF2
make_qc_reads_plot(r1_r2_qc_df, "RNF2")make_qc_table(r1_r2_qc_df, "RNF2")| target_site | var | lab | proto_pam | read_count |
|---|---|---|---|---|
| RNF2 | seqs | R1 | proto | 180327 |
| RNF2 | seqs | R1 | pam | 222795 |
| RNF2 | seqs_with_primer | R1 | proto | 170052 |
| RNF2 | seqs_with_primer | R1 | pam | 213663 |
| RNF2 | seqs_with_umi | R1 | proto | 141928 |
| RNF2 | seqs_with_umi | R1 | pam | 199321 |
| RNF2 | seqs_with_libbarcode | R1 | proto | 122152 |
| RNF2 | seqs_with_libbarcode | R1 | pam | 151119 |
| RNF2 | seqs_with_target | R1 | proto | 150423 |
| RNF2 | seqs_with_target | R1 | pam | 166572 |
| RNF2 | seqs_passing_all_filters | R1 | proto | 113197 |
| RNF2 | seqs_passing_all_filters | R1 | pam | 143297 |
| RNF2 | seqs | R2 | proto | 312944 |
| RNF2 | seqs | R2 | pam | 286467 |
| RNF2 | seqs_with_primer | R2 | proto | 299683 |
| RNF2 | seqs_with_primer | R2 | pam | 276410 |
| RNF2 | seqs_with_umi | R2 | proto | 249380 |
| RNF2 | seqs_with_umi | R2 | pam | 261450 |
| RNF2 | seqs_with_libbarcode | R2 | proto | 212667 |
| RNF2 | seqs_with_libbarcode | R2 | pam | 203807 |
| RNF2 | seqs_with_target | R2 | proto | 260487 |
| RNF2 | seqs_with_target | R2 | pam | 221851 |
| RNF2 | seqs_passing_all_filters | R2 | proto | 201734 |
| RNF2 | seqs_passing_all_filters | R2 | pam | 194637 |
ONE-SEQ SCORE (replicate scatter plots)
All barcodes that were found in both compared datasets are plotted as scatterplots to evaluate the similarity in ONE-Seq score. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.
########################################################
## CURATE R1 DF
########################################################
r1_combined_oneseq_score_plot_df <- r1_combined_df %>%
select(barcode, target_site, min_oneseq_score) %>%
rename(R1 = min_oneseq_score)
########################################################
## CURATE R1 DF
########################################################
r2_oneseq_score_plot_df <- r2_combined_df %>%
select(barcode, target_site, min_oneseq_score) %>%
rename(R2 = min_oneseq_score)
########################################################
## JOIN R1 AND R2 DFS
########################################################
r1_r2_oneseq_score_plot_df <- r1_combined_oneseq_score_plot_df %>%
inner_join(r2_oneseq_score_plot_df) %>%
## comparing only those seqs in common between datasets
drop_na() %>%
group_by(target_site) %>%
arrange(target_site)## Joining, by = c("barcode", "target_site")
########################################################
## SMALL VIEW OF COMBINED DF
########################################################
glimpse(r1_r2_oneseq_score_plot_df)## Rows: 72,451
## Columns: 4
## Groups: target_site [3]
## $ barcode <chr> "GCTCAGCATCGATA", "GATGCGTGTGTGAA", "CGTACTTCTGTGCA", "GAA…
## $ target_site <chr> "EMX1", "EMX1", "EMX1", "EMX1", "EMX1", "EMX1", "EMX1", "E…
## $ R1 <dbl> 1.00023867, 1.00000000, 0.91170466, 0.74217450, 0.58488012…
## $ R2 <dbl> 0.96218068, 1.00000000, 0.80655335, 0.69925173, 0.55024229…
R1 vs. R2
## FUNCTION DEFINED IN R1 SECTION
make_oneseq_scatter_plot(r1_r2_oneseq_score_plot_df, "R1", "R2")## FUNCTION DEFINED IN R1 SECTION
rm(r1_r2_oneseq_score_plot_df)NORMALIZED COUNTS (replicate scatter plots)
Compare the normalized proto & pam counts that are found in the scores.tsv files. All R1 triplicates and all R2 triplicates are reduced (minimum value) and used to create two groups to compare. See main R1-R2 section for details on how this accomplished.
NOTE only barcodes present in both samples are plotted.
## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
#############################################
## DF CURATION
#############################################
####################### R1
## PROTO
r1_normalized_counts_proto_df <- r1_r2_scores_df %>%
select(barcode, min_norm_proto_count, target_site) %>%
rename(R1 = min_norm_proto_count)
## PAM
r1_normalized_counts_pam_df <- r1_r2_scores_df %>%
select(barcode, min_norm_pam_count, target_site) %>%
rename(R1 = min_norm_pam_count)
##################### R2
## PROTO
r2_normalized_counts_proto_df <- r1_r2_scores_df %>%
select(barcode, min_norm_proto_count, target_site) %>%
rename(R2 = min_norm_proto_count)
## PAM
r2_normalized_counts_pam_df <- r1_r2_scores_df %>%
select(barcode, min_norm_pam_count, target_site) %>%
rename(R2 = min_norm_pam_count)
#############################################
## JOIN R1/R2 DFS
#############################################
r1_r2_normalized_counts_pam_df <- r1_normalized_counts_pam_df %>%
left_join(r2_normalized_counts_pam_df)
r1_r2_normalized_counts_proto_df <- r1_normalized_counts_proto_df %>%
left_join(r2_normalized_counts_proto_df)R1 vs. R2
## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(r1_r2_normalized_counts_proto_df, "R1", "R2")make.pam.norm.plot(r1_r2_normalized_counts_pam_df, "R1", "R2")## FUNCTION DEFINED IN R1 SECTION
rm(r1_r2_normalized_counts_proto_df)
rm(r1_r2_normalized_counts_pam_df)RAW COUNTS (replicate scatter plots)
Compare the raw proto & pam counts that are calculated from the scores.tsv files. All R1 triplicates and all R2 triplicates are reduced (minimum value) and used to create two groups to compare. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.
NOTE only barcodes present in both samples are plotted.
## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
#############################################
## DF CURATION
#############################################
####################### R1
## PROTO
r1_raw_counts_proto_df <- r1_r2_scores_df %>%
select(barcode, min_proto_raw_read_count, target_site) %>%
rename(R1 = min_proto_raw_read_count)
## PAM
r1_raw_counts_pam_df <- r1_r2_scores_df %>%
select(barcode, min_pam_raw_read_count, target_site) %>%
rename(R1 = min_pam_raw_read_count)
####################### R2
## PROTO
r2_raw_counts_proto_df <- r1_r2_scores_df %>%
select(barcode, min_proto_raw_read_count, target_site) %>%
rename(R2 = min_proto_raw_read_count)
## PAM
r2_raw_counts_pam_df <- r1_r2_scores_df %>%
select(barcode, min_pam_raw_read_count, target_site) %>%
rename(R2 = min_pam_raw_read_count)
#############################################
## JOIN R1 AND R2 DFS
#############################################
r1_r2_raw_counts_pam_df <- r1_raw_counts_pam_df %>%
left_join(r2_raw_counts_pam_df)
r1_r2_raw_counts_proto_df <- r1_raw_counts_proto_df %>%
left_join(r2_raw_counts_proto_df)R1 vs. R2
## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(r1_r2_raw_counts_proto_df, "R1", "R2")make.pam.raw.counts.plot(r1_r2_raw_counts_pam_df, "R1", "R2")#############################################
## REMOVE OBJETCS
#############################################
rm(r1_r2_raw_counts_proto_df)
rm(r1_r2_raw_counts_pam_df)VENN BAR: PER TARGET BARCODE COMPARISON
TRIPLICATES COMBINED
All R1 triplicates and all R2 triplicates are reduced (minimum value) and used to create two groups to compare. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.
cbPalette <- c("#E69F00",
"#56B4E9",
"#000000")
#############################################
## DF CURATION
#############################################
r1_venn_df <- r1_combined_df %>%
select(barcode, target_site, dataset)
r2_venn_df <- r2_combined_df %>%
select(barcode, target_site, dataset)
#############################################
## DF JOIN
#############################################
r1_r2_venn_df <- plyr::rbind.fill(r1_venn_df, r2_venn_df) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-"))
#############################################
## PLOT
#############################################
ggplotly(ggplot(data = r1_r2_venn_df) +
geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set),
# position = "fill"
position = "fill") +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Lab Combinations",
title = "R1 vs. R2 Replicate Barcodes") +
scale_fill_manual(values=cbPalette))#############################################
## TABLE
#############################################
(r1_r2_venn_table_df <- r1_r2_venn_df %>%
group_by(target_site) %>%
count(lab_set) %>%
group_by(target_site) %>%
mutate(target_site_total = sum(n)) %>%
mutate(pct = round(n/target_site_total * 100, 0),
count_value = paste0(n, " (", pct, "%)")) %>%
select(-n, -pct) %>%
pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | target_site_total | R1 | R1-R2 | R2 |
|---|---|---|---|---|
| EMX1 | 48558 | 5950 (12%) | 35712 (74%) | 6896 (14%) |
| FANCF | 26102 | 5330 (20%) | 16814 (64%) | 3958 (15%) |
| RNF2 | 25204 | 1942 (8%) | 19925 (79%) | 3337 (13%) |
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT
########################################################################
rm(r1_venn_df)
rm(r2_venn_df)BARCODE CONCORDANCE TABLE
Table that breaks down the number of barcodes that are:
- Not concordant (present in only 1 replicate)
- Concordant between 2 replicates
- Concordant in all 3 replicates
NOTE This is based on at least 1 [calculated] raw read to be considered present in the sample. In the future we may want to increase the read threshold to be considered.
########################################################################
# DF CURATION
########################################################################
r1_temp_df <- r1_combined_df %>%
select(barcode, target_site, min_proto_raw_read_count,
min_pam_raw_read_count, dataset)
r2_temp_df <- r2_combined_df %>%
select(barcode, target_site, min_proto_raw_read_count,
min_pam_raw_read_count, dataset)
########################################################################
# JIN DFS
########################################################################
r1_r2_raw_df <- plyr::rbind.fill(r1_temp_df, r2_temp_df)
########################################################################
# FUNCTION
########################################################################
make_concordance_table <- function(df, pam_proto){
if(pam_proto == "proto"){
(df %>%
select(barcode, target_site, dataset, min_proto_raw_read_count) %>%
## include only those with reads of at least 1
filter(min_proto_raw_read_count > 0) %>%
select(-min_proto_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
group_by(target_site, lab_num) %>%
tally() %>%
mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
pivot_wider(names_from = lab_num,
values_from = n,
values_fill = NA) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed",
"responsive", html_font = "Cambria")))
} else if(pam_proto == "pam"){
(df %>%
select(barcode, target_site, dataset, min_pam_raw_read_count) %>%
## include only those with reads of at least 1
filter(min_pam_raw_read_count > 0) %>%
select(-min_pam_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
group_by(target_site, lab_num) %>%
tally() %>%
mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
pivot_wider(names_from = lab_num,
values_from = n,
values_fill = NA) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed",
"responsive", html_font = "Cambria")))
} else {
message("Enter 'pam' or 'proto' for second arg")
}
}
########################################################################
# REMOVE ENV OBJECTS
########################################################################
rm(r1_temp_df)
rm(r2_temp_df)PROTO
make_concordance_table(r1_r2_raw_df, "proto")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
| target_site | concordant_in_1 | concordant_in_2 |
|---|---|---|
| EMX1 | 19223 | 12945 |
| FANCF | 10265 | 6076 |
| RNF2 | 10509 | 7600 |
PAM
make_concordance_table(r1_r2_raw_df, "pam")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
| target_site | concordant_in_1 | concordant_in_2 |
|---|---|---|
| EMX1 | 16593 | 27731 |
| FANCF | 10546 | 12324 |
| RNF2 | 7820 | 15748 |
########################################################################
# REMOVE ENV OBJECTS
########################################################################
rm(r1_r2_raw_df)VENN BAR: 100% TRIPLICATE CONCORDANCE
Only barcodes that are present with at least 1 read count across all triplicates are considered. NOTE no count values are taken into consideration.
########################################################################
# COLOR PALETTE TO BE CONSISTENT WITH PREVIOUS SCHEME
########################################################################
cbPalette2 <- c("#E69F00",
"#56B4E9",
"#000000")
########################################################################
# DF CURATION
########################################################################
r1 <- r1_scores_df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
## include only those with reads of at least 1
filter(proto_raw_read_count > 0) %>%
select(-proto_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
filter(lab_num == 3) %>%
select(-lab_set) %>%
mutate(dataset = "R1")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
r2 <- r2_scores_df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
## include only those with reads of at least 1
filter(proto_raw_read_count > 0) %>%
select(-proto_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
filter(lab_num == 3) %>%
select(-lab_set) %>%
mutate(dataset = "R2")## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
########################################################################
# JOIN DFS
########################################################################
r1_r2 <- plyr::rbind.fill(r1, r2) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-"))## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
########################################################################
# PLOT
########################################################################
ggplotly(ggplot(data = r1_r2) +
geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set),
# position = "fill"
position = "fill") +
coord_flip() +
theme_bw() +
labs(x = "Target Site",
y = "Proportion of Sites",
fill = "Lab Combinations",
title = "R1 vs. R2 Replicate Barcodes") +
scale_fill_manual(values=cbPalette2))########################################################################
# TABLE
########################################################################
(r1_r2_venn_table_df <- r1_r2 %>%
group_by(target_site) %>%
count(lab_set) %>%
group_by(target_site) %>%
mutate(target_site_total = sum(n)) %>%
mutate(pct = round(n/target_site_total * 100, 0),
count_value = paste0(n, " (", pct, "%)")) %>%
select(-n, -pct) %>%
pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))| target_site | target_site_total | R1 | R1-R2 | R2 |
|---|---|---|---|---|
| EMX1 | 3927 | 1316 (34%) | 1668 (42%) | 943 (24%) |
| FANCF | 1716 | 246 (14%) | 929 (54%) | 541 (32%) |
| RNF2 | 1632 | 396 (24%) | 572 (35%) | 664 (41%) |
########################################################################
# REMOVE ENV OBJECTS
########################################################################
rm(r1)
rm(r2)
rm(r1_r2)
rm(r1_r2_venn_table_df)DENSITY PLOTS
TRIPLICATES COMBINED
Similar to the venn bar plots, visualize barcode overlap between samples with raw read counts plotted. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.
########################################################################
# CURATE R1 - R2 DFS
########################################################################
r1_raw_reads_df <- r1_combined_df %>%
select(barcode, target_site, min_proto_raw_read_count,
min_pam_raw_read_count, dataset) %>%
rename(lab = dataset,
proto_raw_read_count = min_proto_raw_read_count,
pam_raw_read_count = min_pam_raw_read_count)
r2_raw_reads_df <- r2_combined_df %>%
select(barcode, target_site, min_pam_raw_read_count,
min_proto_raw_read_count, dataset) %>%
rename(lab = dataset,
proto_raw_read_count = min_proto_raw_read_count,
pam_raw_read_count = min_pam_raw_read_count)
########################################################################
# JOIN DFS
########################################################################
r1_r2_raw_counts_df <- plyr::rbind.fill(r1_raw_reads_df, r2_raw_reads_df)
########################################################################
# REMOVE ENV OBJECTS
########################################################################
rm(r1_raw_reads_df)
rm(r2_raw_reads_df)PROTO
## FUNCTIONS DEFINED IN R1 SECTION
lab_list <- c("R1", "R2")
ggplotly(make.proto.density.plots(r1_r2_raw_counts_df, lab_list))PAM
## FUNCTIONS DEFINED IN R1 SECTION
lab_list <- c("R1", "R2")
ggplotly(make.pam.density.plots(r1_r2_raw_counts_df, lab_list))DENSITY PLOT : 100% TRIPLICATE CONCORDANCE
Only barcodes that are present with at least 1 read count across all triplicates are considered.
##############################################
## GET BARCODES OF 100% CONCORDANT TRIPLICATE
##############################################
r1 <- r1_scores_df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
## include only those with reads of at least 1
filter(proto_raw_read_count > 0) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
filter(lab_num == 3) %>%
select(-lab_set) %>%
mutate(dataset = "R1")
r2 <- r2_scores_df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
## include only those with reads of at least 1
filter(proto_raw_read_count > 0) %>%
select(-proto_raw_read_count) %>%
group_by(target_site, barcode) %>%
## get lab set to figure out how many labs each barcode is present in
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-")) %>%
mutate(lab_num = str_count(lab_set, "-") + 1) %>%
filter(lab_num == 3) %>%
select(-lab_set) %>%
mutate(dataset = "R2")
##############################################
## GENERATE MIN COUNT DFS
##############################################
##################### PROTO #################
r1_concordant_proto_counts <- r1_scores_df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
pivot_wider(names_from = lab,
values_from = proto_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(proto_raw_read_count = pmin(NNN03m,
NNN04m,
NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m) %>%
filter(barcode %in% r1$barcode) %>%
mutate(lab = "R1")
r2_concordant_proto_counts <- r2_scores_df %>%
select(barcode, target_site, lab, proto_raw_read_count) %>%
pivot_wider(names_from = lab,
values_from = proto_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(proto_raw_read_count = pmin(NNN06m,
NNN07m,
NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m) %>%
filter(barcode %in% r2$barcode) %>%
mutate(lab = "R2")
##################### PAM #################
r1_concordant_pam_counts <- r1_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count) %>%
pivot_wider(names_from = lab,
values_from = pam_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(pam_raw_read_count = pmin(NNN03m,
NNN04m,
NNN05m,
na.rm=TRUE)) %>%
select(-NNN03m, -NNN04m, -NNN05m) %>%
filter(barcode %in% r1$barcode) %>%
mutate(lab = "R1")
r2_concordant_pam_counts <- r2_scores_df %>%
select(barcode, target_site, lab, pam_raw_read_count) %>%
pivot_wider(names_from = lab,
values_from = pam_raw_read_count,
values_fill = NA) %>%
## make all 0's NA so it isn't considered in the pmin function
mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
mutate(pam_raw_read_count = pmin(NNN06m,
NNN07m,
NNN08m,
na.rm=TRUE)) %>%
select(-NNN06m, -NNN07m, -NNN08m) %>%
filter(barcode %in% r2$barcode) %>%
mutate(lab = "R2")
##############################################
## GET 1 R1 R2 DF
##############################################
r1_r2_concordant_proto_counts <- plyr::rbind.fill(r1_concordant_proto_counts,
r2_concordant_proto_counts)
r1_r2_concordant_pam_counts <- plyr::rbind.fill(r1_concordant_pam_counts,
r2_concordant_pam_counts)
########################################################################
# REMOVE ENV OBJECTS
########################################################################
rm(r1)
rm(r2)
rm(r1_concordant_proto_counts)
rm(r2_concordant_proto_counts)
rm(r1_concordant_pam_counts)
rm(r2_concordant_pam_counts)PROTO
lab_list <- c("R1", "R2")
ggplotly(make.proto.density.plots(r1_r2_concordant_proto_counts, lab_list))PAM
lab_list <- c("R1", "R2")
ggplotly(make.pam.density.plots(r1_r2_concordant_pam_counts, lab_list))DENSITY PLOT: COMBINED PROTO & PAM
##############################################
## DF CURATION
##############################################
r1_r2_combined_proto_pam_df <- r1_r2_raw_counts_df %>%
mutate(min_proto_pam_raw_count = pmin(proto_raw_read_count,
pam_raw_read_count,
na.rm=TRUE)) %>%
select(-proto_raw_read_count, -pam_raw_read_count)
##############################################
## FUNCTION
##############################################
make.combined.density.plots <- function(df, lablist){
labs <- c(lablist)
density_plot_labset_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site) %>%
group_by(target_site, barcode) %>%
summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
proto_density_plot_df <- df %>%
filter(lab %in% labs) %>%
select(barcode, lab, target_site, min_proto_pam_raw_count) %>%
right_join(density_plot_labset_df) %>%
count(target_site, lab_set, min_proto_pam_raw_count, name = "Occurrence")
options(scipen = 999)
plot <- ggplot(proto_density_plot_df) +
geom_histogram(aes(x = min_proto_pam_raw_count, fill = lab_set),
position = "fill") +
scale_x_log10() +
facet_wrap(~target_site, scales = "free_y", ncol = 1) +
theme_bw() +
scale_fill_brewer(type = "qual") +
labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Combined Proto & PAM Raw Read Counts") +
scale_fill_manual(values=cbPalette)
return(plot)
}
##############################################
## REMOVE ENV OBJECTS
##############################################lab_list <- c("R1", "R2")
make.combined.density.plots(r1_r2_combined_proto_pam_df, lab_list)